{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict, Counter\n",
    "import json\n",
    "import pandas as pd\n",
    "import time\n",
    "from datetime import datetime"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Behance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw Behance log: space-separated, no header row.\n",
    "data = pd.read_csv(\n",
    "    \"raw_dataset/Behance_appreciate_1M\",\n",
    "    sep=\" \",\n",
    "    header=None,\n",
    "    names=['userId', 'itemId', 'time'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the number of interactions per user and per item.\n",
    "# Each column is tallied directly instead of materialising a Series per row\n",
    "# with iterrows. Counter returns 0 for keys it has never seen, so lookups\n",
    "# behave like a defaultdict(int).\n",
    "user_count = Counter(data['userId'].tolist())\n",
    "item_count = Counter(data['itemId'].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop interactions whose user or item has fewer than 5 occurrences, then\n",
    "# re-index the surviving users and items to contiguous ids starting at 1.\n",
    "usermap = dict()   # raw userId -> new user id\n",
    "usernum = 0\n",
    "itemmap = dict()   # raw itemId -> new item id\n",
    "itemnum = 0\n",
    "User = dict()      # new user id -> list of [time, new item id]\n",
    "\n",
    "# itertuples yields plain tuples and keeps each column's dtype, unlike\n",
    "# iterrows, which builds a Series for every row.\n",
    "rows = data[['userId', 'itemId', 'time']].itertuples(index=False, name=None)\n",
    "for session_id, item, time_ in rows:\n",
    "    if user_count[session_id] < 5 or item_count[item] < 5:\n",
    "        continue\n",
    "\n",
    "    # First time this user is kept: assign the next id and start its sequence.\n",
    "    if session_id not in usermap:\n",
    "        usernum += 1\n",
    "        usermap[session_id] = usernum\n",
    "        User[usernum] = []\n",
    "    userid = usermap[session_id]\n",
    "\n",
    "    # First time this item is kept: assign the next id.\n",
    "    if item not in itemmap:\n",
    "        itemnum += 1\n",
    "        itemmap[item] = itemnum\n",
    "    itemid = itemmap[item]\n",
    "\n",
    "    User[userid].append([time_, itemid])\n",
    "\n",
    "# Sort each user's sequence by time.\n",
    "for userid in User.keys():\n",
    "    User[userid].sort(key=lambda d: d[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save one interaction per line: user, item, time.\n",
    "# The with-block closes the file even if a write raises.\n",
    "with open(\"dataset/Behance.txt\", 'w') as f:\n",
    "    for user, sequence in User.items():\n",
    "        for entry in sequence:\n",
    "            # entry is [time, item]; the file stores item before time.\n",
    "            f.write('%d %d %d\\n' % (user, entry[1], entry[0]))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## FSQ"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the FSQ file and count the number of interactions per user and item.\n",
    "rating_info = defaultdict(list)\n",
    "user_count = defaultdict(int)\n",
    "item_count = defaultdict(int)\n",
    "with open(\"raw_dataset/fsq_checkins.dat\", \"r\") as f:\n",
    "    # Stream the file line by line: nothing is held in memory beyond the\n",
    "    # counts, and the name `data` is not rebound to a list of lines.\n",
    "    for record in f:\n",
    "        fields = record.split('|')\n",
    "        user_id = int(fields[1].strip())\n",
    "        item_id = int(fields[2].strip())\n",
    "        user_count[user_id] += 1\n",
    "        item_count[item_id] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop interactions whose user or item has fewer than 5 occurrences, re-index\n",
    "# the rest to contiguous ids starting at 1, and record (item, timestamp)\n",
    "# pairs per user.\n",
    "user_map = {}\n",
    "user_num = 0\n",
    "item_map = {}\n",
    "item_num = 0\n",
    "# Start from an empty mapping so re-running this cell does not append\n",
    "# every interaction a second time.\n",
    "rating_info = defaultdict(list)\n",
    "# Same path as the counting cell: the counts are only valid for that file.\n",
    "with open(\"raw_dataset/fsq_checkins.dat\", \"r\") as f:\n",
    "    for record in f:\n",
    "        fields = record.split('|')\n",
    "        user_id = int(fields[1].strip())\n",
    "        item_id = int(fields[2].strip())\n",
    "        if user_count[user_id] < 5 or item_count[item_id] < 5:\n",
    "            continue\n",
    "        if user_id not in user_map:\n",
    "            user_num += 1\n",
    "            user_map[user_id] = user_num\n",
    "        if item_id not in item_map:\n",
    "            item_num += 1\n",
    "            item_map[item_id] = item_num\n",
    "        # Fixed-offset slicing of the time field; seconds are not used.\n",
    "        # NOTE(review): the offsets assume one leading character before a\n",
    "        # 'YYYY-MM-DD HH:MM:SS'-style value -- confirm against the raw file.\n",
    "        s_time = fields[5]\n",
    "        year = int(s_time[:5])\n",
    "        month = int(s_time[6:8])\n",
    "        day = int(s_time[9:11])\n",
    "        hour = int(s_time[12:14])\n",
    "        minute = int(s_time[15:17])\n",
    "        # mktime interprets the struct_time as local time.\n",
    "        timestamp = time.mktime(datetime(year, month, day, hour, minute).timetuple())\n",
    "        rating_info[user_map[user_id]].append((item_map[item_id], timestamp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order each user's interactions by timestamp.\n",
    "for user in rating_info:\n",
    "    rating_info[user].sort(key=lambda pair: pair[1])\n",
    "\n",
    "# Write one interaction per line: user, item, timestamp.\n",
    "# %d keeps only the integer part of the float timestamp.\n",
    "# The with-block closes the file even if a write raises.\n",
    "with open(\"dataset/fsq.txt\", 'w') as f:\n",
    "    for user, interactions in rating_info.items():\n",
    "        for item, timestamp in interactions:\n",
    "            f.write('%d %d %d\\n' % (user, item, timestamp))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Amazon"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "def parse(path):\n",
    "    g = gzip.open(path, 'r')\n",
    "    for l in g:\n",
    "        yield json.loads(l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = \"reviews_Digital_Music.json.gz\"\n",
    "dataset_save_name = \"Music\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pass 1: count reviews per reviewer and per product.\n",
    "countU = defaultdict(lambda: 0)\n",
    "countP = defaultdict(lambda: 0)\n",
    "line = 0  # running record counter; incremented in both passes\n",
    "for l in parse(f\"raw_dataset/{dataset_name}\"):\n",
    "    line += 1\n",
    "    countU[l['reviewerID']] += 1\n",
    "    countP[l['asin']] += 1\n",
    "\n",
    "usermap = dict()   # reviewerID -> new user id\n",
    "usernum = 0\n",
    "itemmap = dict()   # asin -> new item id\n",
    "itemnum = 0\n",
    "User = dict()      # new user id -> list of [time, new item id]\n",
    "\n",
    "# Pass 2: drop reviews whose reviewer or product has fewer than 5 reviews and\n",
    "# re-index the rest to contiguous ids starting at 1.\n",
    "for l in parse(f\"raw_dataset/{dataset_name}\"):\n",
    "    line += 1\n",
    "    asin = l['asin']\n",
    "    rev = l['reviewerID']\n",
    "    # Named time_ so the imported `time` module is not overwritten; other\n",
    "    # cells call time.mktime and would fail if `time` were an int.\n",
    "    time_ = l['unixReviewTime']\n",
    "    if countU[rev] < 5 or countP[asin] < 5:\n",
    "        continue\n",
    "\n",
    "    if rev not in usermap:\n",
    "        usernum += 1\n",
    "        usermap[rev] = usernum\n",
    "        User[usernum] = []\n",
    "    userid = usermap[rev]\n",
    "\n",
    "    if asin not in itemmap:\n",
    "        itemnum += 1\n",
    "        itemmap[asin] = itemnum\n",
    "    itemid = itemmap[asin]\n",
    "\n",
    "    User[userid].append([time_, itemid])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort each user's sequence by time.\n",
    "for sequence in User.values():\n",
    "    sequence.sort(key=lambda x: x[0])\n",
    "\n",
    "# Write one interaction per line: user, item, time.\n",
    "# The with-block closes the file even if a write raises.\n",
    "with open(f\"dataset/{dataset_save_name}.txt\", 'w') as f:\n",
    "    for user, sequence in User.items():\n",
    "        for entry in sequence:\n",
    "            # entry is [time, item]; the file stores item before time.\n",
    "            f.write('%d %d %d\\n' % (user, entry[1], entry[0]))"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "36c0ccfdaac910ef6f054bea2be285381f9c572f47ba89b1068ec658452dd66b"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 ('kibum')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
